import pandas as pd
import datetime
import numpy as np
import statsmodels.api as sm
import folium
import requests
import itertools
# For Graphics
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Load the FWC python-sightings export and take a first look at its shape,
# columns, missing values, and dtypes.
data = pd.read_csv('pythons.csv')
data.shape
data.columns
# Names of the columns that contain at least one missing value,
# and how many NaNs each of them holds.
col_nan = data.columns[data.isnull().any()].tolist()
null_col = data[col_nan]
null_col.isnull().sum()
data.dtypes
We create a pivot table where the rows are (year, month) pairs and the value is the count of sightings:
# Extract the calendar components once instead of re-parsing the date
# column three times.
obs_dates = pd.DatetimeIndex(data['ObsDate'])
data['ObsYear'] = obs_dates.year
data['ObsMonth'] = obs_dates.month
data['ObsDay'] = obs_dates.day
# Pivot: one row per (year, month), value = number of sightings.
data_obs_date = pd.pivot_table(
    data,
    index=['ObsYear', 'ObsMonth'],
    values='objectid',
    aggfunc=len,
)
obs_date_pivot = data_obs_date.reset_index().sort_values(by=['ObsYear', 'ObsMonth'], ascending=True)
obs_date_pivot.head(10)
# Zero-pad the month so e.g. (2008, 1) becomes "200801"; without padding the
# concatenated string has a variable width and '%Y%m' parsing is fragile.
obs_date_pivot['Obs_Year-Month'] = pd.to_datetime(
    obs_date_pivot['ObsYear'].astype(str)
    + obs_date_pivot['ObsMonth'].astype(str).str.zfill(2),
    format='%Y%m',
)
obs_date_pivot.head(10)
# Plot the monthly sighting counts as a time series.
plt.figure(figsize=(20, 8))
obs_date = sns.lineplot(data=obs_date_pivot, x='Obs_Year-Month', y='objectid', color="b")
obs_date.set_xlabel('Obs_Year-Month')
obs_date.set_ylabel("Number of sightings")
There is a tendency for sightings to have been growing; the drop at the end is likely because the most recent period has incomplete data.
# Count sightings per reported location, keeping only the county part of
# the location string (the text before the first comma) as the label.
location_counts = data['Location'].value_counts().rename_axis('Counties').reset_index(name='Counts')
county_names = location_counts['Counties'].str.split(',', expand=True)[0]
counties_count = location_counts[['Counts']]
counties_count['Counties'] = county_names
counties_count[['Counties', 'Counts']].head(3)
# Bar chart of the 4 counties with the most sightings.
sns.barplot(x='Counties', y='Counts', data=counties_count[['Counties', 'Counts']].head(4),
            label="Total", color="b")
# Joint distributions of sighting coordinates, progressively zoomed in.
coordinates = pd.DataFrame(data[['Latitude', 'Longitude']])
# First pass: drop obviously out-of-range points.
in_range = (coordinates['Latitude'] < 32.5) & (coordinates['Longitude'] < -50)
values = pd.DataFrame(coordinates.loc[in_range])
sns.jointplot(x="Longitude", y="Latitude", data=values)
# Tighter bounding box around the densest cluster of sightings.
tight = (coordinates['Latitude'] < 29) & (coordinates['Longitude'] < -80)
closest = pd.DataFrame(coordinates.loc[tight])
sns.jointplot(x="Longitude", y="Latitude", data=closest)
# One circle marker per unique (latitude, longitude) pair.
sight = closest.groupby(['Latitude', 'Longitude']).count().reset_index()
map1 = folium.Map(
    location=[43.12627, -77.53985],
    tiles='cartodbpositron',
    zoom_start=2,
)
for _, row in sight.iterrows():
    folium.CircleMarker(location=[row["Latitude"], row["Longitude"]]).add_to(map1)
map1
# Count sightings per year and keep the data for the 8 busiest years.
data_obs_year = pd.pivot_table(
    data,
    index=['ObsYear'],
    values='objectid',
    aggfunc=len,
)
obs_year_pivot = data_obs_year.reset_index().sort_values(by='objectid', ascending=False)
# Note: set() drops duplicates (there are none per year) and its iteration
# order determines the order of top_years.
top_years = list(set(obs_year_pivot['ObsYear'].head(8)))
top_years
top_years_data = data[data.ObsYear.isin(top_years)]
top_years_data.head()
# Month-by-year pivot restricted to the top-8 years; margins=True adds an
# 'All' row/column holding the totals.
data_month_topyears = pd.pivot_table(
top_years_data,
index = ['ObsMonth'],
columns = ['ObsYear'],
values = ['objectid'],
aggfunc = 'count',
margins = True
)
# Columns come back as a ('objectid', year) MultiIndex; drop the constant
# first level so the columns are just the years plus 'All'.
data_month_topyears.columns = data_month_topyears.columns.droplevel(0) # We got multiindex, we dropped one level
print(data_month_topyears)
# head(12) keeps the 12 month rows and discards the trailing 'All' margin row.
data_month_topyears = data_month_topyears.head(12)
data_month_topyears
# Overall monthly totals vs. two individual years.
# NOTE(review): hard-codes 2017/2018 — assumes both are among the top 8
# years; confirm against top_years.
data_month_topyears.plot(kind = 'line', figsize=(8, 4), y = ['All',2017,2018])
# Monthly counts for each top year, side by side.
data_month_topyears[top_years].plot(kind = 'bar', figsize=(8, 6))
Across the years we can observe that more Burmese pythons are seen in July and August, with another jump in the winter months. This looks like an unusual tendency, so we start selecting the data by year:
https://myfwc.com/wildlifehabitats/nonnatives/python/faqs/
During warm months they like to stay in the water, but they are more active at night; most pythons are found crossing roads during late hours.
During cooler months you can find them outside the water; they move away from the vegetation edges later on the day.
February, March, and April are the breeding months; during them the pythons stay close to the nest and the water edges.
So yes, there is a seasonal tendency, which is why you can find more pythons during both the warm and the cooler months.
We are going to analyse the [2008 - 2019] period.
# Restrict to the 2008-2019 observation years for the seasonality study.
# .copy() makes this an independent frame, avoiding SettingWithCopyWarning
# when the 'datetime' column is added below.
cyclical_analysis = data[data.ObsYear.isin(list(range(2008, 2020)))].copy()
cyclical_analysis.head()
print(cyclical_analysis.shape)
# Re-index the frame by the parsed observation date.
cyclical_analysis['datetime'] = pd.to_datetime(cyclical_analysis['ObsDate'])
cyclical_analysis = cyclical_analysis.set_index('datetime')
cyclical_analysis.drop(['ObsDate'], axis=1, inplace=True)
print(cyclical_analysis.shape)
# Sightings per day, then bucketed into 14-day bins.
test = cyclical_analysis.groupby('datetime')[['objectid']].count()
test.head(20)
a = test.resample('14D').count()
# NOTE(review): .count() here gives the number of *days with sightings*
# per 14-day bin, not the total number of sightings (.sum() would) —
# confirm which quantity was intended before relying on the magnitudes.
a.head(20)
# Mark empty bins as missing. Use np.nan, not None: DataFrame.replace with
# value=None does not insert missing values — pandas falls back to
# method-based (pad/ffill) replacement, silently forward-filling instead.
a = a.replace(0, np.nan)
a.reset_index().plot(x='datetime', y='objectid', figsize=(20, 10))
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the 14-day series into trend / seasonal / residual components.
decomposition = seasonal_decompose(a)
_ = decomposition.plot()
The sightings are clearly cyclical within the year; there is also a yearly trend, increasing from 2014 to 2017 when it reached a plateau, then lower in 2018 and 2019.
# Hold out the final 23 periods of the 14-day series for evaluation.
b = a['objectid']
holdout = 23
train = b[:-holdout]
test = b[-holdout:]
print(train.head(), test.head())
#Starting on 2008
# NOTE(review): statsmodels.tsa.arima_model.ARMA is deprecated and was
# removed in statsmodels 0.13 — the modern replacement is
# statsmodels.tsa.arima.model.ARIMA with order=(2, 0, 1). Kept as-is here
# because later cells rely on this results object's API (plot_predict).
from statsmodels.tsa.arima_model import ARMA
# Fit an ARMA(p=2, q=1) model on the training portion of the series.
model = ARMA(
train,
order = (2,1)
).fit()
model.summary()
# Out-of-sample forecast covering exactly the hold-out window.
forecast_start = len(train)
forecast_end = forecast_start + len(test) - 1
predictions = model.predict(start=forecast_start, end=forecast_end)

# Observed vs. predicted counts, side by side.
obs_pre = pd.DataFrame({'observed': test, 'predicted': predictions})
obs_pre

# Root-mean-square error of the forecast against the hold-out data.
from statsmodels.tools.eval_measures import rmse
rmse(test, predictions)
# Autocorrelation of the hold-out series.
pd.plotting.autocorrelation_plot(test)
# Forecast slightly past the hold-out window (+2 periods) with confidence bands.
model.plot_predict(
    start=len(train),
    end=len(train) + len(test) + 2,
)
# Fix: the original chained assignment (`prediction2019 = predictions = ...`)
# also silently rebound `predictions`; bind only the intended name.
prediction2019 = model.predict(
    start=len(train),
    end=len(train) + len(test) + 2,
)
# Total forecast vs. total observed+predicted from the earlier table.
prediction2019.sum().round()
obs_pre.sum().round()
import itertools

# Candidate (p, d, q) orders, each component in {0, 1} -> 8 combinations.
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
# Matching seasonal orders with a 26-period cycle (26 x 14-day bins ~ 1 year).
# Reuse pdq instead of re-enumerating the same product a second time.
seasonal_pdq = [(*order, 26) for order in pdq]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
This step is parameter selection for our seasonal ARIMA time-series model. The goal is to use a grid search to find the set of parameters that yields the best model performance.
# Grid-search every (order, seasonal_order) combination, reporting the AIC
# of each fitted model; combinations that fail to fit are skipped.
# (Indentation restored — the notebook dump had flattened it.)
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(a,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}26 - AIC:{}'.format(param, param_seasonal, results.aic))
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any model-fitting failure is still skipped.
        except Exception:
            continue
# Refit the configuration the grid search selected as best.
best_order = (1, 0, 1)
best_seasonal_order = (0, 1, 1, 26)
mod = sm.tsa.statespace.SARIMAX(
    a,
    order=best_order,
    seasonal_order=best_seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
arima_results = mod.fit()
arima_results.summary()
The above output suggests that SARIMA(1, 0, 1)x(0, 1, 1, 26) yields the lowest AIC value, 1237.55. Therefore we consider this the optimal option.
# Residual diagnostics (standardized residuals, histogram, Q-Q, correlogram).
diagnostics_fig = arima_results.plot_diagnostics(figsize=(16, 8))
plt.show()
It is not perfect; however, the model diagnostics suggest that the residuals are approximately normally distributed.
# One-step-ahead (dynamic=False) prediction over the hold-out window,
# plus its confidence interval.
# (Removed a dead commented-out plotting snippet copied from an unrelated
# tutorial — it referenced 'Furniture Sales' and an undefined frame.)
pred = arima_results.get_prediction(start=len(train), end=len(train) + len(test) - 1, dynamic=False)
pred_ci = pred.conf_int()
pred
#ARMA
# ARMA forecast over the hold-out window plus 2 extra periods,
# tabulated next to the observed counts.
ARMA2019_pred = model.predict(start=len(train), end=len(train) + len(test) + 2)
ARMA2019 = pd.DataFrame({
    'observed': test,
    'ARMA': ARMA2019_pred,
})
# ARIMA
# SARIMAX prediction over the same horizon.
ARIMA2019 = arima_results.get_prediction(start=len(train), end=len(train) + len(test) + 2, dynamic=False)
ARI_MAX_MIN = ARIMA2019.conf_int()  # lower/upper confidence bounds
ARI_MEAN = ARIMA2019.predicted_mean
ARMA2019['ARIMA2019'] = ARI_MEAN
ARMA2019.sum()
# Overlay both forecasts on the observed series.
ax = ARMA2019.plot()
ARI_MEAN.plot(ax=ax, label='ARIMA', alpha=.7, figsize=(14, 7))
ax.set_xlabel('Date')
# Fixed label: the original 'Furniture Sales' was copied from an unrelated
# tutorial — this plot shows python sighting counts.
ax.set_ylabel('Number of sightings')
plt.legend()
plt.show()
# Fetch up to 100 verifiable observations from the iNaturalist API for the
# hard-coded place/taxon filters in the query string.
url = "https://api.inaturalist.org/v1/observations?verifiable=true&order_by=observations.id&order=desc&spam=false&place_id=21&taxon_id=238252&locale=en-US&per_page=100"
results = requests.get(url).json()
inaturalist = pd.DataFrame(results)
inaturalist.head(10)
# Flatten the nested 'results' records into one row per observation.
# pandas.io.json.json_normalize is deprecated; pd.json_normalize is the
# supported top-level equivalent with identical behavior.
flattened_data = pd.json_normalize(inaturalist['results'])
flattened_data
flattened_data.shape[0]
Since we have only 83 observations, we should not rely on iNaturalist alone to get the public to engage.
# Sightings per iNaturalist user, then chart the 5 most active reporters.
users = flattened_data.pivot_table(
    index='user.login',
    values='id',
    aggfunc=len,
)
top_active_users = users.reset_index().sort_values(by='id', ascending=False).head(5)
top_active_users
plt.figure(figsize=(10, 8))
ax = sns.barplot(x="id", y="user.login", data=top_active_users, color="b")
ax.set_xlabel("Sighting pythons")
ax.set_ylabel("Users")
list(flattened_data.columns)
# Keep only research-grade observations, with their id and reviewer list.
research_mask = flattened_data['quality_grade'] == 'research'
obs = flattened_data.loc[research_mask, ['id', 'reviewed_by', 'quality_grade']]
obs.head()
# Flatten the per-observation reviewer lists into one flat list of users.
dfToList = obs['reviewed_by'].tolist()
flat_list = [reviewer for sublist in dfToList for reviewer in sublist]
# Top 5 users by number of observations they reviewed.
idenfiying_pythons = pd.DataFrame(flat_list)[0].value_counts().head(5)
plt.figure(figsize=(10, 5))
# seaborn >= 0.12 removed positional data arguments — x=/y= are required.
ax = sns.barplot(x=idenfiying_pythons.index, y=idenfiying_pythons.values, color="b")
ax.set_xlabel("Users")
ax.set_ylabel("Idenfiying Pythons")
# Getting combos
from itertools import combinations

# Explode each observation's reviewer list into one row per reviewer,
# giving (reviewer-string, observation-id) pairs. (A duplicate, unassigned
# copy of this pd.concat expression was removed.)
df = flattened_data[['id', 'reviewed_by']]
df_combos = pd.concat([pd.Series(row['id'], str(row['reviewed_by']).split(','))
                       for _, row in df.iterrows()]).reset_index()
df_combos.columns = ['id', 'reviewed_by']
# Strip the list brackets left over from str()-ing the reviewer list.
# Fixes vs. the original: it referenced `combos` before it was assigned
# (NameError — `df_combos` was meant), and '[' is an invalid regex pattern,
# so the replacements must be literal (regex=False).
df_combos['id'] = df_combos['id'].str.replace('[', '', regex=False)
df_combos['id'] = df_combos['id'].str.replace(']', '', regex=False)
combos = list(df_combos.itertuples(index=False))
len(combos)  # 498 pairs
# Build and draw the reviewer/observation graph.
# Fix: the original used `G` and `nx` without ever importing networkx or
# creating the graph; both are provided here so the cell is self-contained.
import networkx as nx

G = nx.Graph()
G.add_edges_from(combos)
plt.figure(3, figsize=(15, 15))
nx.draw_kamada_kawai(G,
                     with_labels=True,
                     node_size=200,
                     font_size=10,
                     node_color='#29b6f6',
                     alpha=1)
# Top 5 Influencers
# Compute degree centrality once (the original called it twice — once for
# the iterable, once for the sort key) and take the 5 highest-degree nodes.
centrality = nx.degree_centrality(G)  # measure of popularity based on a node's degree
sorted(centrality, key=centrality.get, reverse=True)[:5]
# Cleaning user.name missing values from flattened_data
# Keep an untouched backup, then drop rows with a missing user.name in place.
flattened_data_cleaned = flattened_data.copy()
flattened_data.dropna(subset=['user.name'], inplace = True)
# Fix: report the row count of the frame that was actually cleaned — the
# original printed the backup's (pre-drop) size.
flattened_data.shape[0]
# Cleaning Reporter from data
# .copy() prevents SettingWithCopyWarning when the Name columns are added.
data_reporter = data[['Reporter']].copy()
# Split "First Last ..." once instead of three times.
name_parts = data_reporter['Reporter'].str.split(" ", expand=True)
data_reporter['Name1'] = name_parts[0]
data_reporter['Name2'] = name_parts[0] + ' ' + name_parts[1]
# iNaturalist display names matching a reporter's first name or
# "First Last" pair from the FWC data.
val1 = np.intersect1d(flattened_data['user.name'], data_reporter['Name1'])
val1
val2 = np.intersect1d(flattened_data['user.name'], data_reporter['Name2'])
val2
print("Users/Reporters overlap:")
for val in val1:
    print(val)
for val in val2:
    print(val)